#!/bin/bash
# LSF batch job script. The lines that begin with the BSUB marker are read by
# `bsub` when the job is submitted; bash itself treats them as comments, so
# they must stay exactly in their `#BSUB <option>` form.
#
#   -q fat_768           queue the job is submitted to
#   -n 80                number of job slots requested
#   -R "span[hosts=1]"   all requested slots must be on a single host
#   -J ppo_cpu_chatglm3  job name
#   -o / -e              files receiving the job's stdout / stderr;
#                        %J is replaced by the LSF job ID
#==========  LSF options  ==========
#BSUB -q fat_768
#BSUB -n 80
#BSUB -R "span[hosts=1]"
#BSUB -J ppo_cpu_chatglm3
#BSUB -o /share/home/zhangshanqi/QSM/cpu/trl/logs/ppo_cpu_chatglm3.%J.out
#BSUB -e /share/home/zhangshanqi/QSM/cpu/trl/logs/ppo_cpu_chatglm3.%J.err

#==========  Runtime environment  ==========
# Every setup step is checked and the job aborts on the first failure: the
# launch command later in this script uses a path relative to this directory
# and is meant to run inside the `qsm` conda environment, so continuing after
# a failed step would run the wrong code or the wrong interpreter.
# Explicit checks are used instead of `set -eu` because conda's activation
# hooks are not guaranteed to be clean under `set -u`.
cd /share/home/zhangshanqi/QSM/cpu/trl || {
  printf '%s\n' 'error: cannot cd to /share/home/zhangshanqi/QSM/cpu/trl' >&2
  exit 1
}

# conda.sh defines the `conda` shell function that `conda activate` relies on
# in a non-interactive shell.
source /share/home/zhangshanqi/QSM/RLHF/miniconda3/etc/profile.d/conda.sh || {
  printf '%s\n' 'error: cannot source conda.sh from the miniconda3 install' >&2
  exit 1
}

conda activate qsm || {
  printf '%s\n' 'error: failed to activate conda environment "qsm"' >&2
  exit 1
}

#==========  Threads and affinity  ==========
# Cap the thread pools of OpenMP, MKL, NumExpr and OpenBLAS at 40 threads each.
# NOTE(review): the LSF options at the top of this script request 80 slots
# (-n 80) while these pools are capped at 40 — confirm that is intended.
export OMP_NUM_THREADS=40 \
       MKL_NUM_THREADS=40 \
       NUMEXPR_NUM_THREADS=40 \
       OPENBLAS_NUM_THREADS=40

# Intel OpenMP runtime tuning: short spin-wait (in ms) before idle threads
# sleep, and the thread-pinning policy.
export KMP_BLOCKTIME=1 \
       KMP_AFFINITY=granularity=fine,compact,1,0

# Turn off the tokenizers library's own parallelism, and export an empty CUDA
# device list so that no GPU is visible to the job.
export TOKENIZERS_PARALLELISM=false \
       CUDA_VISIBLE_DEVICES=""

#==========  Disable outbound network probing  ==========
# Put the Hugging Face Hub client in offline mode and switch off its telemetry.
export HF_HUB_OFFLINE=1 \
       HF_HUB_DISABLE_TELEMETRY=1

#==========  Launch PPO training  ==========
# All model inputs and the training output live under /scratch/$USER.
# Refuse to run with an unset or empty $USER: the paths would otherwise
# silently collapse to /scratch//... .
scratch_dir="/scratch/${USER:?USER must be set to locate /scratch/<user>}"
policy_model="${scratch_dir}/chatglm3_6b"
reward_model="${scratch_dir}/专家01reward_model_pairwise_fp16"

# Fail fast with a clear message instead of after interpreter/library start-up.
for model_path in "$policy_model" "$reward_model"; do
  if [[ ! -e "$model_path" ]]; then
    printf 'error: model path not found: %s\n' "$model_path" >&2
    exit 1
  fi
done

# Options consumed by `accelerate launch` itself: one process, no mixed
# precision.
launcher_args=(
  --config_file /share/home/zhangshanqi/QSM/cpu/trl/default_config.yaml
  --mixed_precision no
  --num_processes 1
)

# Options passed through to examples/scripts/ppo/ppo.py.
ppo_args=(
  # Dataset.
  # NOTE(review): no data file path is passed alongside `--dataset_name csv` —
  # confirm the script locates the CSV data itself.
  --dataset_name csv
  --dataset_train_split train

  # Models: --model_name_or_path and --sft_model_path point at the same
  # checkpoint.
  --model_name_or_path "$policy_model"
  --sft_model_path     "$policy_model"
  --reward_model_path  "$reward_model"

  # Output location.
  --output_dir "${scratch_dir}/专家01ppo01"

  # Batching and optimisation: batch size 1 per device, gradients accumulated
  # over 8 steps.
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 8
  --learning_rate 1e-6
  --max_grad_norm 1.0
  --total_episodes 3000

  # PPO hyper-parameters.
  --num_ppo_epochs 3
  --num_mini_batches 1
  --missing_eos_penalty 1.0
  --cliprange 0.2
  --cliprange_value 0.2
  --whiten_rewards True
  --vf_coef 0.1
)

# The script path is relative to the working directory set earlier in this
# file. This is the last command, so the job's exit status is accelerate's.
accelerate launch "${launcher_args[@]}" \
  examples/scripts/ppo/ppo.py "${ppo_args[@]}"

